In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [2]:
df=pd.read_csv('marketing_campaign.csv')
print(df)
ID Year_Birth Education Marital_Status Income Kidhome \
0 5524 1957 Graduation Single 58138.0 0
1 2174 1954 Graduation Single 46344.0 1
2 4141 1965 Graduation Together 71613.0 0
3 6182 1984 Graduation Together 26646.0 1
4 5324 1981 PhD Married 58293.0 1
... ... ... ... ... ... ...
2235 10870 1967 Graduation Married 61223.0 0
2236 4001 1946 PhD Together 64014.0 2
2237 7270 1981 Graduation Divorced 56981.0 0
2238 8235 1956 Master Together 69245.0 0
2239 9405 1954 PhD Married 52869.0 1
Teenhome Dt_Customer Recency MntWines ... NumWebVisitsMonth \
0 0 4/9/2012 58 635 ... 7
1 1 8/3/2014 38 11 ... 5
2 0 21-08-2013 26 426 ... 4
3 0 10/2/2014 26 11 ... 6
4 0 19-01-2014 94 173 ... 5
... ... ... ... ... ... ...
2235 1 13-06-2013 46 709 ... 5
2236 1 10/6/2014 56 406 ... 7
2237 0 25-01-2014 91 908 ... 6
2238 1 24-01-2014 8 428 ... 3
2239 1 15-10-2012 40 84 ... 7
AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 \
0 0 0 0 0 0
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
... ... ... ... ... ...
2235 0 0 0 0 0
2236 0 0 0 1 0
2237 0 1 0 0 0
2238 0 0 0 0 0
2239 0 0 0 0 0
Complain Z_CostContact Z_Revenue Response
0 0 3 11 1
1 0 3 11 0
2 0 3 11 0
3 0 3 11 0
4 0 3 11 0
... ... ... ... ...
2235 0 3 11 0
2236 0 3 11 0
2237 0 3 11 0
2238 0 3 11 0
2239 0 3 11 1
[2240 rows x 29 columns]
In [3]:
df.head()
Out[3]:
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 4/9/2012 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 8/3/2014 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10/2/2014 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
5 rows × 29 columns
In [4]:
df.tail()
Out[4]:
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2235 | 10870 | 1967 | Graduation | Married | 61223.0 | 0 | 1 | 13-06-2013 | 46 | 709 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2236 | 4001 | 1946 | PhD | Together | 64014.0 | 2 | 1 | 10/6/2014 | 56 | 406 | ... | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 11 | 0 |
| 2237 | 7270 | 1981 | Graduation | Divorced | 56981.0 | 0 | 0 | 25-01-2014 | 91 | 908 | ... | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2238 | 8235 | 1956 | Master | Together | 69245.0 | 0 | 1 | 24-01-2014 | 8 | 428 | ... | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2239 | 9405 | 1954 | PhD | Married | 52869.0 | 1 | 1 | 15-10-2012 | 40 | 84 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
5 rows × 29 columns
In [5]:
df.shape
Out[5]:
(2240, 29)
In [6]:
df.isnull().sum()
Out[6]:
ID 0 Year_Birth 0 Education 0 Marital_Status 0 Income 24 Kidhome 0 Teenhome 0 Dt_Customer 0 Recency 0 MntWines 0 MntFruits 0 MntMeatProducts 0 MntFishProducts 0 MntSweetProducts 0 MntGoldProds 0 NumDealsPurchases 0 NumWebPurchases 0 NumCatalogPurchases 0 NumStorePurchases 0 NumWebVisitsMonth 0 AcceptedCmp3 0 AcceptedCmp4 0 AcceptedCmp5 0 AcceptedCmp1 0 AcceptedCmp2 0 Complain 0 Z_CostContact 0 Z_Revenue 0 Response 0 dtype: int64
In [7]:
df.dropna(inplace=True)
In [8]:
df.isnull().sum()
Out[8]:
ID 0 Year_Birth 0 Education 0 Marital_Status 0 Income 0 Kidhome 0 Teenhome 0 Dt_Customer 0 Recency 0 MntWines 0 MntFruits 0 MntMeatProducts 0 MntFishProducts 0 MntSweetProducts 0 MntGoldProds 0 NumDealsPurchases 0 NumWebPurchases 0 NumCatalogPurchases 0 NumStorePurchases 0 NumWebVisitsMonth 0 AcceptedCmp3 0 AcceptedCmp4 0 AcceptedCmp5 0 AcceptedCmp1 0 AcceptedCmp2 0 Complain 0 Z_CostContact 0 Z_Revenue 0 Response 0 dtype: int64
In [9]:
df
Out[9]:
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 4/9/2012 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 8/3/2014 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10/2/2014 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 10870 | 1967 | Graduation | Married | 61223.0 | 0 | 1 | 13-06-2013 | 46 | 709 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2236 | 4001 | 1946 | PhD | Together | 64014.0 | 2 | 1 | 10/6/2014 | 56 | 406 | ... | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 11 | 0 |
| 2237 | 7270 | 1981 | Graduation | Divorced | 56981.0 | 0 | 0 | 25-01-2014 | 91 | 908 | ... | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2238 | 8235 | 1956 | Master | Together | 69245.0 | 0 | 1 | 24-01-2014 | 8 | 428 | ... | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2239 | 9405 | 1954 | PhD | Married | 52869.0 | 1 | 1 | 15-10-2012 | 40 | 84 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
2216 rows × 29 columns
In [10]:
print(df.duplicated().sum())
0
In [11]:
df.columns
Out[11]:
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
dtype='object')
In [12]:
df.columns=df.columns.str.strip()
df
Out[12]:
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 4/9/2012 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 8/3/2014 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10/2/2014 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 10870 | 1967 | Graduation | Married | 61223.0 | 0 | 1 | 13-06-2013 | 46 | 709 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2236 | 4001 | 1946 | PhD | Together | 64014.0 | 2 | 1 | 10/6/2014 | 56 | 406 | ... | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 11 | 0 |
| 2237 | 7270 | 1981 | Graduation | Divorced | 56981.0 | 0 | 0 | 25-01-2014 | 91 | 908 | ... | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2238 | 8235 | 1956 | Master | Together | 69245.0 | 0 | 1 | 24-01-2014 | 8 | 428 | ... | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2239 | 9405 | 1954 | PhD | Married | 52869.0 | 1 | 1 | 15-10-2012 | 40 | 84 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
2216 rows × 29 columns
In [13]:
df.dtypes
Out[13]:
ID int64 Year_Birth int64 Education object Marital_Status object Income float64 Kidhome int64 Teenhome int64 Dt_Customer object Recency int64 MntWines int64 MntFruits int64 MntMeatProducts int64 MntFishProducts int64 MntSweetProducts int64 MntGoldProds int64 NumDealsPurchases int64 NumWebPurchases int64 NumCatalogPurchases int64 NumStorePurchases int64 NumWebVisitsMonth int64 AcceptedCmp3 int64 AcceptedCmp4 int64 AcceptedCmp5 int64 AcceptedCmp1 int64 AcceptedCmp2 int64 Complain int64 Z_CostContact int64 Z_Revenue int64 Response int64 dtype: object
In [14]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 2216 entries, 0 to 2239 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2216 non-null int64 1 Year_Birth 2216 non-null int64 2 Education 2216 non-null object 3 Marital_Status 2216 non-null object 4 Income 2216 non-null float64 5 Kidhome 2216 non-null int64 6 Teenhome 2216 non-null int64 7 Dt_Customer 2216 non-null object 8 Recency 2216 non-null int64 9 MntWines 2216 non-null int64 10 MntFruits 2216 non-null int64 11 MntMeatProducts 2216 non-null int64 12 MntFishProducts 2216 non-null int64 13 MntSweetProducts 2216 non-null int64 14 MntGoldProds 2216 non-null int64 15 NumDealsPurchases 2216 non-null int64 16 NumWebPurchases 2216 non-null int64 17 NumCatalogPurchases 2216 non-null int64 18 NumStorePurchases 2216 non-null int64 19 NumWebVisitsMonth 2216 non-null int64 20 AcceptedCmp3 2216 non-null int64 21 AcceptedCmp4 2216 non-null int64 22 AcceptedCmp5 2216 non-null int64 23 AcceptedCmp1 2216 non-null int64 24 AcceptedCmp2 2216 non-null int64 25 Complain 2216 non-null int64 26 Z_CostContact 2216 non-null int64 27 Z_Revenue 2216 non-null int64 28 Response 2216 non-null int64 dtypes: float64(1), int64(25), object(3) memory usage: 519.4+ KB
In [15]:
df.describe()
Out[15]:
| ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | ... | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.0 | 2216.0 | 2216.000000 |
| mean | 5588.353339 | 1968.820397 | 52247.251354 | 0.441787 | 0.505415 | 49.012635 | 305.091606 | 26.356047 | 166.995939 | 37.637635 | ... | 5.319043 | 0.073556 | 0.074007 | 0.073105 | 0.064079 | 0.013538 | 0.009477 | 3.0 | 11.0 | 0.150271 |
| std | 3249.376275 | 11.985554 | 25173.076661 | 0.536896 | 0.544181 | 28.948352 | 337.327920 | 39.793917 | 224.283273 | 54.752082 | ... | 2.425359 | 0.261106 | 0.261842 | 0.260367 | 0.244950 | 0.115588 | 0.096907 | 0.0 | 0.0 | 0.357417 |
| min | 0.000000 | 1893.000000 | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 25% | 2814.750000 | 1959.000000 | 35303.000000 | 0.000000 | 0.000000 | 24.000000 | 24.000000 | 2.000000 | 16.000000 | 3.000000 | ... | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 50% | 5458.500000 | 1970.000000 | 51381.500000 | 0.000000 | 0.000000 | 49.000000 | 174.500000 | 8.000000 | 68.000000 | 12.000000 | ... | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 75% | 8421.750000 | 1977.000000 | 68522.000000 | 1.000000 | 1.000000 | 74.000000 | 505.000000 | 33.000000 | 232.250000 | 50.000000 | ... | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| max | 11191.000000 | 1996.000000 | 666666.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | ... | 20.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.0 | 11.0 | 1.000000 |
8 rows × 26 columns
In [16]:
df.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True)
In [17]:
df
Out[17]:
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 4/9/2012 | 58 | 635 | ... | 10 | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 8/3/2014 | 38 | 11 | ... | 1 | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 2 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10/2/2014 | 26 | 11 | ... | 0 | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 3 | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 10870 | 1967 | Graduation | Married | 61223.0 | 0 | 1 | 13-06-2013 | 46 | 709 | ... | 3 | 4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2236 | 4001 | 1946 | PhD | Together | 64014.0 | 2 | 1 | 10/6/2014 | 56 | 406 | ... | 2 | 5 | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2237 | 7270 | 1981 | Graduation | Divorced | 56981.0 | 0 | 0 | 25-01-2014 | 91 | 908 | ... | 3 | 13 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2238 | 8235 | 1956 | Master | Together | 69245.0 | 0 | 1 | 24-01-2014 | 8 | 428 | ... | 5 | 10 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2239 | 9405 | 1954 | PhD | Married | 52869.0 | 1 | 1 | 15-10-2012 | 40 | 84 | ... | 1 | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2216 rows × 27 columns
In [18]:
df['Age']=2024-df['Year_Birth']
df
Out[18]:
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 4/9/2012 | 58 | 635 | ... | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 67 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 8/3/2014 | 38 | 11 | ... | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 70 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 59 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10/2/2014 | 26 | 11 | ... | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 43 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 10870 | 1967 | Graduation | Married | 61223.0 | 0 | 1 | 13-06-2013 | 46 | 709 | ... | 4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 57 |
| 2236 | 4001 | 1946 | PhD | Together | 64014.0 | 2 | 1 | 10/6/2014 | 56 | 406 | ... | 5 | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 78 |
| 2237 | 7270 | 1981 | Graduation | Divorced | 56981.0 | 0 | 0 | 25-01-2014 | 91 | 908 | ... | 13 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 43 |
| 2238 | 8235 | 1956 | Master | Together | 69245.0 | 0 | 1 | 24-01-2014 | 8 | 428 | ... | 10 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 68 |
| 2239 | 9405 | 1954 | PhD | Married | 52869.0 | 1 | 1 | 15-10-2012 | 40 | 84 | ... | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 70 |
2216 rows × 28 columns
In [19]:
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='mixed')
In [20]:
df
Out[20]:
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 2012-04-09 | 58 | 635 | ... | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 67 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 2014-08-03 | 38 | 11 | ... | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 70 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 2013-08-21 | 26 | 426 | ... | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 59 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 2014-10-02 | 26 | 11 | ... | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 2014-01-19 | 94 | 173 | ... | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 43 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 10870 | 1967 | Graduation | Married | 61223.0 | 0 | 1 | 2013-06-13 | 46 | 709 | ... | 4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 57 |
| 2236 | 4001 | 1946 | PhD | Together | 64014.0 | 2 | 1 | 2014-10-06 | 56 | 406 | ... | 5 | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 78 |
| 2237 | 7270 | 1981 | Graduation | Divorced | 56981.0 | 0 | 0 | 2014-01-25 | 91 | 908 | ... | 13 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 43 |
| 2238 | 8235 | 1956 | Master | Together | 69245.0 | 0 | 1 | 2014-01-24 | 8 | 428 | ... | 10 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 68 |
| 2239 | 9405 | 1954 | PhD | Married | 52869.0 | 1 | 1 | 2012-10-15 | 40 | 84 | ... | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 70 |
2216 rows × 28 columns
In [21]:
df.value_counts('Education')
Out[21]:
Education Graduation 1116 PhD 481 Master 365 2n Cycle 200 Basic 54 Name: count, dtype: int64
In [22]:
plt.figure(figsize=(14,7))
sns.countplot(x='Education',data=df)
plt.xlabel('Education')
plt.show()
In [23]:
fig = px.bar(df, x = 'Education', y = "Income", color = "Education", title = "Education vs Income")
fig.update_layout(xaxis = {'categoryorder':'total descending'})
fig.show()
In [24]:
df.value_counts('Marital_Status')
Out[24]:
Marital_Status Married 857 Together 573 Single 471 Divorced 232 Widow 76 Alone 3 Absurd 2 YOLO 2 Name: count, dtype: int64
In [25]:
plt.figure(figsize=(14,7))
sns.countplot(x='Marital_Status',data=df)
plt.xlabel('Marital_Status')
plt.show()
In [26]:
fig = px.bar(df, x = 'Marital_Status', y = "Income", color = "Marital_Status", title = "Marital_Status vs Income")
fig.update_layout(xaxis = {'categoryorder':'total descending'})
fig.show()
In [27]:
df.value_counts('AcceptedCmp1')
Out[27]:
AcceptedCmp1 0 2074 1 142 Name: count, dtype: int64
In [28]:
df.value_counts('AcceptedCmp2')
Out[28]:
AcceptedCmp2 0 2186 1 30 Name: count, dtype: int64
In [29]:
df.value_counts('AcceptedCmp3')
Out[29]:
AcceptedCmp3 0 2053 1 163 Name: count, dtype: int64
In [30]:
df.value_counts('AcceptedCmp4')
Out[30]:
AcceptedCmp4 0 2052 1 164 Name: count, dtype: int64
In [31]:
df.value_counts('AcceptedCmp5')
Out[31]:
AcceptedCmp5 0 2054 1 162 Name: count, dtype: int64
In [32]:
df.value_counts('Response')
Out[32]:
Response 0 1883 1 333 Name: count, dtype: int64
In [33]:
df.groupby("Marital_Status")["Recency"].sum().plot(kind = 'pie' ,autopct='%1.1f%%')
plt.title('Distribution of Recency by Marital Status')
plt.show()
In [34]:
plt.figure(figsize=(14,7))
sns.countplot(x='Kidhome',data=df,palette='Set1')
plt.xlabel('Kidhome')
plt.show()
In [35]:
plt.figure(figsize=(14,7))
sns.countplot(x='Teenhome',data=df, palette="Set2")
plt.xlabel('Teenhome')
plt.show()
In [36]:
df = df[df['Age'] <= 100]
df
Out[36]:
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 2012-04-09 | 58 | 635 | ... | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 67 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 2014-08-03 | 38 | 11 | ... | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 70 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 2013-08-21 | 26 | 426 | ... | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 59 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 2014-10-02 | 26 | 11 | ... | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 2014-01-19 | 94 | 173 | ... | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 43 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 10870 | 1967 | Graduation | Married | 61223.0 | 0 | 1 | 2013-06-13 | 46 | 709 | ... | 4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 57 |
| 2236 | 4001 | 1946 | PhD | Together | 64014.0 | 2 | 1 | 2014-10-06 | 56 | 406 | ... | 5 | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 78 |
| 2237 | 7270 | 1981 | Graduation | Divorced | 56981.0 | 0 | 0 | 2014-01-25 | 91 | 908 | ... | 13 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 43 |
| 2238 | 8235 | 1956 | Master | Together | 69245.0 | 0 | 1 | 2014-01-24 | 8 | 428 | ... | 10 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 68 |
| 2239 | 9405 | 1954 | PhD | Married | 52869.0 | 1 | 1 | 2012-10-15 | 40 | 84 | ... | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 70 |
2213 rows × 28 columns
In [37]:
age_counts=df['Age'].value_counts().sort_values(ascending=False)
plt.figure(figsize=(14,7))
sns.countplot(x='Age',data=df,order=age_counts.index)
plt.xlabel('Age')
plt.xticks(rotation=90)
plt.show()
In [38]:
sns.scatterplot(x='Age' ,y= 'Recency' ,data=df)
plt.title("Scatter Plot of Age vs. Recency")
plt.show()
In [39]:
plt.figure(figsize=(14,7))
sns.countplot(x='NumDealsPurchases',data=df)
plt.xlabel('NumDealsPurchases')
plt.show()
In [40]:
plt.figure(figsize=(14,7))
sns.countplot(x='NumStorePurchases',data=df)
plt.xlabel('NumStorePurchases')
plt.show()
In [41]:
plt.figure(figsize=(14,7))
sns.countplot(x='NumWebPurchases',data=df)
plt.xlabel('NumWebPurchases')
plt.show()
In [42]:
plt.figure(figsize=(14,7))
sns.countplot(x='NumCatalogPurchases',data=df)
plt.xlabel('NumCatalogPurchases')
plt.show()
In [43]:
plt.figure(figsize=(14,7))
sns.barplot(x='Marital_Status',y='NumStorePurchases',data=df, errorbar=('ci',0))
plt.xlabel('Marital_Status')
plt.ylabel('NumStorePurchases')
plt.title('Marital_Status vs NumStorePurchases')
plt.xticks(rotation=45)
plt.show()
In [44]:
plt.figure(figsize=(14,7))
sns.barplot(x='Marital_Status',y='NumWebPurchases',data=df, errorbar=('ci',0))
plt.xlabel('Marital_Status')
plt.ylabel('NumWebPurchases')
plt.title('Marital_Status vs NumWebPurchases')
plt.xticks(rotation=45)
plt.show()
In [45]:
plt.figure(figsize=(14,7))
sns.barplot(x='Marital_Status',y='NumCatalogPurchases',data=df, errorbar=('ci',0))
plt.xlabel('Marital_Status')
plt.ylabel('NumCatalogPurchases')
plt.title('Marital_Status vs NumCatalogPurchases')
plt.xticks(rotation=45)
plt.show()
In [46]:
plt.figure(figsize=(14,7))
sns.barplot(x='Marital_Status',y='NumWebVisitsMonth',data=df, errorbar=('ci',0))
plt.xlabel('Marital_Status')
plt.ylabel('NumWebVisitsMonth')
plt.title('Marital_Status vs NumWebVisitsMonth')
plt.xticks(rotation=45)
plt.show()
In [47]:
total_sum = df["NumDealsPurchases"].sum()
print(f"Total sum of NumDealsPurchases: {total_sum}")
Total sum of NumDealsPurchases: 5146
In [48]:
total_sum = df["NumWebPurchases"].sum()
print(f"Total sum of NumWebPurchases: {total_sum}")
Total sum of NumWebPurchases: 9046
In [49]:
total_sum = df["NumCatalogPurchases"].sum()
print(f"Total sum of NumCatalogPurchases: {total_sum}")
Total sum of NumCatalogPurchases: 5912
In [50]:
total_sum = df["NumStorePurchases"].sum()
print(f"Total sum of NumStorePurchases: {total_sum}")
Total sum of NumStorePurchases: 12847
In [51]:
total_sum = df["NumWebVisitsMonth"].sum()
print(f"Total sum of NumWebVisitsMonth: {total_sum}")
Total sum of NumWebVisitsMonth: 11777
In [52]:
data = df.groupby("Marital_Status")["MntMeatProducts"].sum().reset_index()
fig = px.pie(data, values='MntMeatProducts', names='Marital_Status', title='Meat Products Sum by Marital Status')
fig.show()
In [53]:
data = df.groupby("Marital_Status")["MntFruits"].sum().reset_index()
fig = px.pie(data, values='MntFruits', names='Marital_Status', title='Fruit Products Sum by Marital Status')
fig.show()
In [54]:
data = df.groupby("Marital_Status")["MntWines"].sum().reset_index()
fig = px.pie(data, values='MntWines', names='Marital_Status', title='wine Products Sum by Marital Status')
fig.show()
In [55]:
data = df.groupby("Marital_Status")["MntFishProducts"].sum().reset_index()
fig = px.pie(data, values='MntFishProducts', names='Marital_Status', title='Fish Products Sum by Marital Status')
fig.show()
In [56]:
data = df.groupby("Marital_Status")["MntSweetProducts"].sum().reset_index()
fig = px.pie(data, values='MntSweetProducts', names='Marital_Status', title='Sweet Products Sum by Marital Status')
fig.show()
In [57]:
data = df.groupby("Marital_Status")["MntGoldProds"].sum().reset_index()
fig = px.pie(data, values='MntGoldProds', names='Marital_Status', title='Gold Products Sum by Marital Status')
fig.show()
In [58]:
correlation = df[['Income','Kidhome','Teenhome','Recency','MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts',
'MntGoldProds','NumDealsPurchases','NumWebPurchases','NumCatalogPurchases','NumStorePurchases','NumWebVisitsMonth']].corr()
plt.figure(figsize=(10,8))
sns.heatmap(correlation, cmap='coolwarm')
for i in range(correlation.shape[0]):
for j in range(correlation.shape[1]):
plt.text(j+0.5, i+0.5, "{:.2f}".format(correlation.iloc[i, j]), ha='center', va='center', color='white')
plt.title('Correlation Heatmap')
plt.show()
In [59]:
fig = px.sunburst(df, path=['Education', 'Marital_Status'], values='Income', title='Sunburst Plot of Income by Education and Marital Status')
fig.show()
In [60]:
complain_counts = df['Complain'].value_counts()
plt.figure(figsize=(8,8))
complain_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title('Distribution of Complains')
plt.ylabel('')
plt.show()
In [61]:
df.value_counts('Complain')
Out[61]:
Complain 0 2193 1 20 Name: count, dtype: int64
In [62]:
df.groupby("Education")["Complain"].sum().plot(kind = 'pie' ,autopct='%1.1f%%')
plt.title('Distribution of Complaints by Education')
plt.ylabel('')
plt.show()
In [63]:
df.groupby("Marital_Status")["Complain"].sum().plot(kind = 'pie' ,autopct='%1.1f%%')
plt.title('Distribution of Complaints by Marital_Status')
plt.show()
In [64]:
df.columns
Out[64]:
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Response', 'Age'],
dtype='object')
In [65]:
total_sum = df["MntWines"].sum()
print(f"Total sum of MntWines: {total_sum}")
Total sum of MntWines: 675305
In [66]:
total_sum = df["MntFruits"].sum()
print(f"Total sum of MntFruits: {total_sum}")
Total sum of MntFruits: 58255
In [67]:
total_sum = df["MntMeatProducts"].sum()
print(f"Total sum of MntMeatProducts: {total_sum}")
Total sum of MntMeatProducts: 369488
In [68]:
total_sum = df["MntFishProducts"].sum()
print(f"Total sum of MntFishProducts: {total_sum}")
Total sum of MntFishProducts: 83287
In [69]:
total_sum = df["MntSweetProducts"].sum()
print(f"Total sum of MntSweetProducts: {total_sum}")
Total sum of MntSweetProducts: 59828
In [70]:
total_sum = df["MntGoldProds"].sum()
print(f"Total sum of MntGoldProds: {total_sum}")
Total sum of MntGoldProds: 97176
Conclusion:¶
- The job with highest salary is Data Analytics lead.
- The job with second highest salary is Principal Data Engineer.
- The Employee who have Expert level experience gets the highest salary.
- The Employee with remote ratio is 100 has highest salary.
- The most common employment type here is full-time.
- The majority of people work in medium-sized companies.
- In 2022, Salary has increased than in 2021 and 2020.
- The most prevalent employee level is Senior Level.
- The Employee who has remote ratio 100 are the highest.
- Most of the workers residence is in US.
- Employee's who are working in a company located at RU gets the highest salary and the second is US.
- Most of the Employees are working in a company which are located at US.
- Most of the Employees are working as Data Scientist and Data Engineer.
- Employee's whose residence is in MY gets the highest salary.
- The majority of people work as full time gets the highest salary.
- In 2021, who had mid-level experience got highest salary.
- Employee who work in a large company gets the highest salary.
- The majority of people who has senior level experience receives highest salary.